#!/usr/bin/python
# coding=utf-8
import win32com.client
import simhash
import textprocess
# Configuration
minWord=50  # Only paragraphs whose length is greater than this are recorded
class wordprocess:
    """Read a Word document and compute a simhash for each long paragraph.

    Attributes set by the constructor:

    * ``docText``: list of paragraphs (the document text split on carriage
      returns) whose length is greater than the module-level ``minWord``.
    * ``shashlist``: list whose first element is the document's file name;
      every following element is a dict with the keys ``"text"``,
      ``"simhash"`` and ``"copylist"`` describing one kept paragraph.
    """

    def __init__(self, filename):
        """Open *filename* in Word, extract its paragraphs and hash them.

        Args:
            filename: Path of the document; passed unchanged to
                ``Documents.Open``.
        """
        # Obtain the Word application COM object and keep its window hidden.
        MSWord = win32com.client.Dispatch("Word.Application")
        MSWord.Visible = 0
        doc = MSWord.Documents.Open(filename)
        try:
            # Read from the document that was just opened instead of
            # Documents[0]: when the Word instance already holds other
            # documents, index 0 is not guaranteed to be this file.
            docText = doc.Content.Text
        finally:
            # Release the document even when reading its content fails.
            doc.Close()
        # NOTE(review): the Word application itself is never quit here;
        # confirm whether callers rely on it staying alive between files.

        # Word separates paragraphs with a carriage return.
        paragraphs = docText.split('\r')
        self.docText = [para for para in paragraphs if len(para) > minWord]

        # First entry is the bare file name; paragraph records follow.
        self.shashlist = [self._basename(filename)]
        self._makesimhash()

    @staticmethod
    def _basename(filename):
        """Return the last path component of *filename*.

        Both forward slashes and backslashes are treated as separators so
        that Windows-style paths yield the file name rather than the whole
        path.
        """
        return filename.replace("\\", "/").rsplit("/", 1)[-1]

    def _makesimhash(self):
        """Append one record per paragraph of ``docText`` to ``shashlist``.

        Each paragraph is passed through ``textprocess.textprocess().fenci``
        (presumably word segmentation -- confirm against that module) and the
        result is given to ``simhash.simhash``. The record stores the original
        text, the resulting hash and an initially empty ``copylist``.
        """
        tp = textprocess.textprocess()
        for text in self.docText:
            tokens = tp.fenci(text)
            shash = simhash.simhash(tokens)
            self.shashlist.append(
                {"text": text, "simhash": shash, "copylist": []}
            )
# Disabled manual demo: the entry-point code below is wrapped in a string
# literal, so it is never executed when the module is run or imported.
'''
if __name__=="__main__":
    demo=wordprocess("X:\\毕业设计\\开题报告-张非凡（0810420238）.doc")

'''
